diff --git a/apps/api/main.py b/apps/api/main.py
index 8258308..eb1275f 100644
--- a/apps/api/main.py
+++ b/apps/api/main.py
@@ -1927,15 +1927,18 @@ async def system_status_v075(db=Depends(get_db)):
     except Exception:
         components["payments"] = "outage"
 
-    # uptime_30d / incidents: there is NO platform-uptime measurement or incident
-    # history behind these yet. The previous code claimed a hardcoded 99.97 and an
-    # empty incidents list — a fabricated "99.97% uptime, zero incidents" that was
-    # false (the prior query referenced service_probes.outcome/created_at columns
-    # that do not exist, so it errored every call and always returned the literal).
-    # Report null/unmeasured until a real source exists (see status_checks recorder
-    # follow-up). `status` + `components` above ARE real, live-measured signals.
-    uptime_30d = None
-    incidents = None
+    # uptime_30d / incidents: real values from UptimeRobot (the chosen source),
+    # cached ~5 min. NO hardcoded number — the previous code returned a fabricated
+    # 99.97 + empty incidents (its query hit service_probes.outcome/created_at,
+    # columns that don't exist, so it errored every call and fell back to the
+    # literal). Unconfigured → uptime_30d=None / incidents=None ("unmeasured");
+    # unreachable → last cached snapshot if any, else unmeasured. Never fabricated.
+    # `status` + `components` above are independent, live-measured signals.
+    from services.uptime import get_uptime_snapshot
+    _uptime = await get_uptime_snapshot()
+    uptime_30d = _uptime["uptime_30d"]
+    incidents = _uptime["incidents"]
+    uptime_source = _uptime["uptime_source"]
 
     # Overall rollup:
     # "outage" = only when the api component itself is unreachable (gateway down).
@@ -1953,9 +1956,9 @@ async def system_status_v075(db=Depends(get_db)):
     return {
         "status": overall,
         "components": components,
-        "uptime_30d": uptime_30d,  # null until real uptime is instrumented
-        "uptime_source": "unmeasured",  # no platform-uptime history source yet
-        "incidents": incidents,  # null = unmeasured (NOT an empty list / "zero incidents")
+        "uptime_30d": uptime_30d,  # real (UptimeRobot) or null when unmeasured
+        "uptime_source": uptime_source,  # "uptimerobot" | "unmeasured"
+        "incidents": incidents,  # real down-events, or null when unmeasured (never a fake [])
     }
 
 
diff --git a/apps/api/pytest.ini b/apps/api/pytest.ini
index 372dcc2..1398b03 100644
--- a/apps/api/pytest.ini
+++ b/apps/api/pytest.ini
@@ -11,5 +11,6 @@ python_files = test_suite_v060.py test_suite_v062.py test_suite_v0610.py test_se
     test_tier3_provider_gating.py
     test_x402_client.py
     test_rails_flag.py
+    test_uptime_source.py
 markers =
     no_api_key: test does not require WAYFORTH_TEST_API_KEY (e.g. probes unauthenticated paths)
diff --git a/apps/api/services/uptime.py b/apps/api/services/uptime.py
new file mode 100644
index 0000000..2da2294
--- /dev/null
+++ b/apps/api/services/uptime.py
@@ -0,0 +1,176 @@
+"""services/uptime.py — real uptime + incidents from UptimeRobot.
+
+The status page's uptime number used to be a hardcoded 99.97 with an empty
+incidents list (fabricated). The real source is UptimeRobot: this module calls
+its getMonitors API with a READ-ONLY key, reads the Wayforth Gateway monitor's
+30-day `custom_uptime_ratio`, and derives incidents from the monitor's event log
+(real down events — e.g. a rank-service outage shows up, instead of a false
+"none in 90 days").
+
+Result is cached in-process for ~5 minutes so the public /system/status endpoint
+never hammers UptimeRobot; a failed fetch is cached too (for ~1 minute) so an
+UptimeRobot outage cannot turn every status request into a slow outbound call.
+Every failure mode falls back to the last good snapshot or to "unmeasured"
+(uptime=None, incidents=None) — we NEVER fabricate a number.
+
+Config (backend env):
+  UPTIMEROBOT_API_KEY — read-only API key (also accepts UPTIMEROBOT_READ_ONLY_KEY)
+  UPTIMEROBOT_MONITOR_ID — optional: pin the exact monitor. When set, ONLY that
+                           monitor is used (no match → unmeasured). When unset,
+                           the monitor whose name contains "gateway"/"wayforth"
+                           is used, else the first monitor returned.
+"""
+from __future__ import annotations
+
+import logging
+import os
+import time
+from datetime import datetime, timezone
+
+import httpx
+
+logger = logging.getLogger("wayforth")
+
+_API_URL = "https://api.uptimerobot.com/v2/getMonitors"
+_CACHE_TTL = 300  # seconds (~5 min) a fetched snapshot is reused
+_FAILURE_TTL = 60  # seconds before retrying after a failed fetch
+_HTTP_TIMEOUT = 6.0
+
+# In-process cache: {"at": epoch, "ttl": seconds, "value": snapshot}
+_cache: dict = {}
+
+
+def _api_key() -> str:
+    """Return the configured read-only API key, or "" when none is set."""
+    return (
+        os.environ.get("UPTIMEROBOT_API_KEY")
+        or os.environ.get("UPTIMEROBOT_READ_ONLY_KEY")
+        or ""
+    )
+
+
+def _unmeasured() -> dict:
+    """Return a fresh "no data" snapshot (every measured field is None)."""
+    return {"uptime_30d": None, "uptime_source": "unmeasured", "incidents": None}
+
+
+def _pick_monitor(monitors: list[dict]) -> dict | None:
+    """Choose the monitor whose uptime is published, or None if there is none.
+
+    A pinned UPTIMEROBOT_MONITOR_ID is authoritative: if it matches nothing we
+    return None instead of silently reporting some other monitor's numbers.
+    """
+    if not monitors:
+        return None
+    pinned = os.environ.get("UPTIMEROBOT_MONITOR_ID", "").strip()
+    if pinned:
+        for m in monitors:
+            if str(m.get("id")) == pinned:
+                return m
+        logger.warning("uptimerobot monitor %s not in getMonitors response", pinned)
+        return None
+    for m in monitors:  # prefer the gateway/wayforth monitor by name
+        name = (m.get("friendly_name") or "").lower()
+        if "gateway" in name or "wayforth" in name:
+            return m
+    return monitors[0]
+
+
+def _incidents_from_logs(monitor: dict, *, limit: int = 10) -> list[dict]:
+    """Map UptimeRobot down-events (log type 1) to incident records.
+
+    Returns at most `limit` records, newest first. A malformed log entry is
+    skipped so one bad row cannot discard the rest of the snapshot.
+    """
+    incidents = []
+    for log in monitor.get("logs") or []:
+        if log.get("type") != 1:  # 1 = down (2 = up, 99 = paused)
+            continue
+        ts = log.get("datetime")
+        try:
+            started = (
+                datetime.fromtimestamp(int(ts), tz=timezone.utc).isoformat()
+                if ts else None
+            )
+            duration = int(log.get("duration") or 0)
+        except (TypeError, ValueError, OverflowError, OSError):
+            logger.warning("uptimerobot log entry skipped (malformed): %r", log)
+            continue
+        reason = log.get("reason") or {}
+        incidents.append({
+            "started_at": started,
+            "duration_seconds": duration,
+            "reason": reason.get("detail") or reason.get("code") or "down",
+        })
+    incidents.sort(key=lambda i: i.get("started_at") or "", reverse=True)
+    return incidents[:limit]
+
+
+async def get_uptime_snapshot() -> dict:
+    """Return {uptime_30d, uptime_source, incidents}. Cached ~5 min, fail-safe.
+
+    Falls back to unmeasured (None) on any error or when no key is configured —
+    never fabricates a value. After a failed fetch the fallback (last good
+    snapshot if any, else unmeasured) is reused for _FAILURE_TTL seconds before
+    UptimeRobot is contacted again.
+    """
+    key = _api_key()
+    if not key:
+        return _unmeasured()
+
+    now = time.time()
+    cached = _cache.get("value")
+    ttl = _cache.get("ttl", _CACHE_TTL)
+    if cached is not None and (now - _cache.get("at", 0)) < ttl:
+        return cached
+
+    try:
+        async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT) as client:
+            resp = await client.post(
+                _API_URL,
+                data={
+                    "api_key": key,
+                    "format": "json",
+                    "custom_uptime_ratios": "30",  # 30-day ratio
+                    "logs": "1",
+                },
+                headers={"Cache-Control": "no-cache"},
+            )
+        data = resp.json()
+        if data.get("stat") != "ok":
+            logger.warning("uptimerobot getMonitors not ok: %s", data.get("error"))
+            return _cache_and_return(_unmeasured(), now)
+
+        monitor = _pick_monitor(data.get("monitors", []))
+        if not monitor:
+            return _cache_and_return(_unmeasured(), now)
+
+        ratio_raw = monitor.get("custom_uptime_ratio")
+        # custom_uptime_ratio is a string like "99.950" for the single window.
+        uptime = None
+        if ratio_raw not in (None, ""):
+            try:
+                uptime = round(float(str(ratio_raw).split("-")[0]), 3)
+            except (ValueError, TypeError):
+                uptime = None
+
+        snapshot = {
+            "uptime_30d": uptime,
+            "uptime_source": "uptimerobot" if uptime is not None else "unmeasured",
+            "incidents": _incidents_from_logs(monitor),
+        }
+        return _cache_and_return(snapshot, now)
+    except Exception as exc:
+        logger.warning("uptimerobot fetch failed: %s", exc)
+        # Serve a stale cached value if we have one; else unmeasured. Cache the
+        # fallback briefly so an outage is not retried on every request.
+        fallback = cached if cached is not None else _unmeasured()
+        return _cache_and_return(fallback, now, ttl=_FAILURE_TTL)
+
+
+def _cache_and_return(snapshot: dict, now: float, ttl: float = _CACHE_TTL) -> dict:
+    """Store `snapshot` in the process cache for `ttl` seconds and return it."""
+    _cache["value"] = snapshot
+    _cache["at"] = now
+    _cache["ttl"] = ttl
+    return snapshot
diff --git a/apps/api/tests/test_uptime_source.py b/apps/api/tests/test_uptime_source.py
new file mode 100644
index 0000000..46f70a0
--- /dev/null
+++ b/apps/api/tests/test_uptime_source.py
@@ -0,0 +1,136 @@
+"""test_uptime_source.py — UptimeRobot uptime/incidents client (offline).
+
+Locks in the honesty contract: no key or any failure → unmeasured (None), NEVER a
+fabricated number; a real getMonitors payload → real ratio + real down-events.
+"""
+from __future__ import annotations
+
+import asyncio
+
+import pytest
+
+from services import uptime
+
+
+@pytest.fixture(autouse=True)
+def _clean(monkeypatch):
+    for v in ("UPTIMEROBOT_API_KEY", "UPTIMEROBOT_READ_ONLY_KEY", "UPTIMEROBOT_MONITOR_ID"):
+        monkeypatch.delenv(v, raising=False)
+    uptime._cache.clear()
+    yield
+    uptime._cache.clear()
+
+
+def test_no_key_is_unmeasured():
+    snap = asyncio.run(uptime.get_uptime_snapshot())
+    assert snap == {"uptime_30d": None, "uptime_source": "unmeasured", "incidents": None}
+
+
+def test_pick_monitor_prefers_gateway():
+    mons = [{"id": 1, "friendly_name": "Docs site"},
+            {"id": 2, "friendly_name": "Wayforth Gateway"}]
+    assert uptime._pick_monitor(mons)["id"] == 2
+
+
+def test_pick_monitor_honors_pinned_id(monkeypatch):
+    monkeypatch.setenv("UPTIMEROBOT_MONITOR_ID", "1")
+    mons = [{"id": 1, "friendly_name": "Docs site"},
+            {"id": 2, "friendly_name": "Wayforth Gateway"}]
+    assert uptime._pick_monitor(mons)["id"] == 1
+
+
+def test_pick_monitor_pinned_id_missing_is_none(monkeypatch):
+    monkeypatch.setenv("UPTIMEROBOT_MONITOR_ID", "999")
+    mons = [{"id": 1, "friendly_name": "Docs site"},
+            {"id": 2, "friendly_name": "Wayforth Gateway"}]
+    assert uptime._pick_monitor(mons) is None
+
+
+def test_incidents_only_down_events():
+    mon = {"logs": [
+        {"type": 1, "datetime": 1718000000, "duration": 540,
+         "reason": {"code": "503", "detail": "rank-service down"}},
+        {"type": 2, "datetime": 1718000540, "duration": 0, "reason": {"code": "200"}},
+        {"type": 99, "datetime": 1717000000, "duration": 0, "reason": {}},
+    ]}
+    inc = uptime._incidents_from_logs(mon)
+    assert len(inc) == 1
+    assert inc[0]["reason"] == "rank-service down"
+    assert inc[0]["duration_seconds"] == 540
+    assert inc[0]["started_at"].endswith("+00:00")
+
+
+def test_incidents_skip_malformed_entries():
+    mon = {"logs": [
+        {"type": 1, "datetime": "not-a-timestamp", "duration": 5},
+        {"type": 1, "datetime": 1718000000, "duration": 60,
+         "reason": {"detail": "timeout"}},
+    ]}
+    inc = uptime._incidents_from_logs(mon)
+    assert [i["reason"] for i in inc] == ["timeout"]
+
+
+def test_live_payload_parses(monkeypatch):
+    """A mocked getMonitors response yields a real ratio + incidents (no network)."""
+    monkeypatch.setenv("UPTIMEROBOT_API_KEY", "ro-test-key")
+
+    class _Resp:
+        def json(self):
+            return {"stat": "ok", "monitors": [{
+                "id": 777, "friendly_name": "Wayforth Gateway", "status": 2,
+                "custom_uptime_ratio": "99.231",
+                "logs": [{"type": 1, "datetime": 1718000000, "duration": 60,
+                          "reason": {"detail": "timeout"}}],
+            }]}
+
+    class _Client:
+        def __init__(self, *a, **k): pass
+        async def __aenter__(self): return self
+        async def __aexit__(self, *a): return False
+        async def post(self, *a, **k): return _Resp()
+
+    monkeypatch.setattr(uptime.httpx, "AsyncClient", _Client)
+    snap = asyncio.run(uptime.get_uptime_snapshot())
+    assert snap["uptime_30d"] == 99.231
+    assert snap["uptime_source"] == "uptimerobot"
+    assert len(snap["incidents"]) == 1 and snap["incidents"][0]["reason"] == "timeout"
+
+
+def test_api_error_falls_back_unmeasured(monkeypatch):
+    monkeypatch.setenv("UPTIMEROBOT_API_KEY", "ro-test-key")
+
+    class _Resp:
+        def json(self): return {"stat": "fail", "error": {"message": "bad key"}}
+
+    class _Client:
+        def __init__(self, *a, **k): pass
+        async def __aenter__(self): return self
+        async def __aexit__(self, *a): return False
+        async def post(self, *a, **k): return _Resp()
+
+    monkeypatch.setattr(uptime.httpx, "AsyncClient", _Client)
+    snap = asyncio.run(uptime.get_uptime_snapshot())
+    assert snap["uptime_30d"] is None
+    assert snap["uptime_source"] == "unmeasured"
+
+
+def test_fetch_failure_is_cached_briefly(monkeypatch):
+    """A failed fetch is not retried on the very next call (no hammering)."""
+    monkeypatch.setenv("UPTIMEROBOT_API_KEY", "ro-test-key")
+    calls = []
+
+    class _Client:
+        def __init__(self, *a, **k): pass
+        async def __aenter__(self): return self
+        async def __aexit__(self, *a): return False
+        async def post(self, *a, **k):
+            calls.append(1)
+            raise RuntimeError("network down")
+
+    monkeypatch.setattr(uptime.httpx, "AsyncClient", _Client)
+    first = asyncio.run(uptime.get_uptime_snapshot())
+    second = asyncio.run(uptime.get_uptime_snapshot())
+    assert first == second == {
+        "uptime_30d": None, "uptime_source": "unmeasured", "incidents": None,
+    }
+    assert len(calls) == 1