WayforthOfficial · WayforthOfficial · Jun 20, 2026 · Jun 20, 2026
diff --git a/apps/api/main.py b/apps/api/main.py
@@ -1927,15 +1927,18 @@ async def system_status_v075(db=Depends(get_db)):
     except Exception:
         components["payments"] = "outage"
 
-    # uptime_30d / incidents: there is NO platform-uptime measurement or incident
-    # history behind these yet. The previous code claimed a hardcoded 99.97 and an
-    # empty incidents list — a fabricated "99.97% uptime, zero incidents" that was
-    # false (the prior query referenced service_probes.outcome/created_at columns
-    # that do not exist, so it errored every call and always returned the literal).
-    # Report null/unmeasured until a real source exists (see status_checks recorder
-    # follow-up). `status` + `components` above ARE real, live-measured signals.
-    uptime_30d = None
-    incidents = None
+    # uptime_30d / incidents: real values from UptimeRobot (the chosen source),
+    # cached ~5 min. NO hardcoded number — the previous code returned a fabricated
+    # 99.97 + empty incidents (its query hit service_probes.outcome/created_at,
+    # columns that don't exist, so it errored every call and fell back to the
+    # literal). When UptimeRobot is unconfigured/unreachable this returns
+    # uptime_30d=None / incidents=None ("unmeasured") — never a fabricated value.
+    # `status` + `components` above are independent, live-measured signals.
+    from services.uptime import get_uptime_snapshot
+    _uptime = await get_uptime_snapshot()
+    uptime_30d = _uptime["uptime_30d"]
+    incidents = _uptime["incidents"]
+    uptime_source = _uptime["uptime_source"]
 
     # Overall rollup:
     # "outage" = only when the api component itself is unreachable (gateway down).
@@ -1953,9 +1956,9 @@ async def system_status_v075(db=Depends(get_db)):
     return {
         "status": overall,
         "components": components,
-        "uptime_30d": uptime_30d,        # null until real uptime is instrumented
-        "uptime_source": "unmeasured",   # no platform-uptime history source yet
-        "incidents": incidents,          # null = unmeasured (NOT an empty list / "zero incidents")
+        "uptime_30d": uptime_30d,        # real (UptimeRobot) or null when unmeasured
+        "uptime_source": uptime_source,  # "uptimerobot" | "unmeasured"
+        "incidents": incidents,          # real down-events, or null when unmeasured (never a fake [])
     }
 
 

diff --git a/apps/api/pytest.ini b/apps/api/pytest.ini
@@ -11,5 +11,6 @@ python_files = test_suite_v060.py test_suite_v062.py test_suite_v0610.py test_se
     test_tier3_provider_gating.py
     test_x402_client.py
     test_rails_flag.py
+    test_uptime_source.py
 markers =
     no_api_key: test does not require WAYFORTH_TEST_API_KEY (e.g. probes unauthenticated paths)
diff --git a/apps/api/services/uptime.py b/apps/api/services/uptime.py
@@ -0,0 +1,147 @@
+"""services/uptime.py — real uptime + incidents from UptimeRobot.
+
+The status page's uptime number used to be a hardcoded 99.97 with an empty
+incidents list (fabricated). The real source is UptimeRobot: this module calls
+its getMonitors API with a READ-ONLY key, reads the Wayforth Gateway monitor's
+30-day `custom_uptime_ratio`, and derives incidents from the monitor's event log
+(real down events — e.g. a rank-service outage shows up, instead of a false
+"none in 90 days").
+
+Fetched results are cached in-process for ~5 minutes so the public /system/status
+endpoint does not call UptimeRobot on every request. A failed fetch yields the last
+cached snapshot or "unmeasured" (uptime=None, incidents=None) — NEVER a made-up number.
+
+Config (backend env):
+  UPTIMEROBOT_API_KEY       — read-only API key (also accepts UPTIMEROBOT_READ_ONLY_KEY)
+  UPTIMEROBOT_MONITOR_ID    — optional: pin the exact monitor; otherwise the
+                              monitor whose name contains "gateway"/"wayforth" is
+                              used, else the first monitor returned.
+"""
+from __future__ import annotations
+
+import logging
+import os
+import time
+from datetime import datetime, timezone
+
+import httpx
+
+logger = logging.getLogger("wayforth")
+
+_API_URL = "https://api.uptimerobot.com/v2/getMonitors"
+_CACHE_TTL = 300  # seconds (~5 min)
+_HTTP_TIMEOUT = 6.0
+
+# In-process cache: {"at": epoch, "value": snapshot}
+_cache: dict = {}
+
+
+def _api_key() -> str:
+    """Return the UptimeRobot API key from the environment ("" when unset)."""
+    env = os.environ
+    primary = env.get("UPTIMEROBOT_API_KEY")
+    # An empty primary value counts as unset, so the read-only alias is tried.
+    return primary or env.get("UPTIMEROBOT_READ_ONLY_KEY") or ""
+
+
+def _unmeasured() -> dict:  # fallback snapshot; a new dict is built per call
+    return dict(uptime_30d=None, uptime_source="unmeasured", incidents=None)
+
+
+def _pick_monitor(monitors: list[dict]) -> dict | None:
+    """Choose the monitor to report: pinned id, else name match, else first."""
+    if not monitors:
+        return None
+    wanted = os.environ.get("UPTIMEROBOT_MONITOR_ID", "").strip()
+    if wanted:
+        hit = next((m for m in monitors if str(m.get("id")) == wanted), None)
+        if hit is not None:
+            return hit
+    def _named(m: dict) -> bool:  # name mentions the gateway/wayforth monitor
+        label = (m.get("friendly_name") or "").lower()
+        return "gateway" in label or "wayforth" in label
+    return next(filter(_named, monitors), monitors[0])
+
+
+def _incidents_from_logs(monitor: dict, *, limit: int = 10) -> list[dict]:
+    """Build incident records from a monitor's down events, newest first.
+
+    Only log entries whose ``type`` is 1 (down) are kept; 2 (up) and 99
+    (paused) are ignored. At most ``limit`` records are returned.
+    """
+    def _record(entry: dict) -> dict:
+        epoch, why = entry.get("datetime"), entry.get("reason") or {}
+        when = datetime.fromtimestamp(int(epoch), tz=timezone.utc) if epoch else None
+        return {
+            "started_at": when.isoformat() if when is not None else None,
+            "duration_seconds": int(entry.get("duration") or 0),
+            "reason": why.get("detail") or why.get("code") or "down",
+        }
+
+    downs = [_record(e) for e in monitor.get("logs", []) or [] if e.get("type") == 1]
+    # ISO-8601 UTC strings order chronologically; missing timestamps sort last.
+    downs.sort(key=lambda rec: rec["started_at"] or "", reverse=True)
+    return downs[:limit]
+
+
+async def get_uptime_snapshot() -> dict:
+    """Return {uptime_30d, uptime_source, incidents}. Cached ~5 min, fail-safe.
+
+    No key -> unmeasured, with no network call. A failed fetch (transport
+    error, bad body, stat != "ok") serves the last cached snapshot if any, else
+    unmeasured, and is cached for up to 60 s so an outage is not retried (and
+    waited on for _HTTP_TIMEOUT) by every request. Never fabricates a value.
+    """
+    key = _api_key()
+    if not key:
+        return _unmeasured()
+
+    now = time.time()
+    cached = _cache.get("value")
+    if cached is not None and (now - _cache.get("at", 0)) < _CACHE_TTL:
+        return cached
+    # Failure result: stale data beats none; back-dated stamp => retry in <=60 s.
+    stale = cached if cached is not None else _unmeasured()
+    failed_at = now - max(_CACHE_TTL - 60, 0)
+    try:
+        async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT) as client:
+            resp = await client.post(
+                _API_URL,
+                data={
+                    "api_key": key,
+                    "format": "json",
+                    "custom_uptime_ratios": "30",  # 30-day ratio
+                    "logs": "1",
+                },
+                headers={"Cache-Control": "no-cache"},
+            )
+        data = resp.json()
+        if data.get("stat") != "ok":
+            logger.warning("uptimerobot getMonitors not ok: %s", data.get("error"))
+            return _cache_and_return(stale, failed_at)
+
+        monitor = _pick_monitor(data.get("monitors", []))
+        if not monitor:
+            return _cache_and_return(_unmeasured(), now)
+
+        # custom_uptime_ratio is a string like "99.950" for the single window.
+        ratio_raw = monitor.get("custom_uptime_ratio")
+        try:
+            uptime = round(float(str(ratio_raw).split("-")[0]), 3)
+        except (ValueError, TypeError):  # missing, empty or non-numeric ratio
+            uptime = None
+        snapshot = {
+            "uptime_30d": uptime,
+            "uptime_source": "uptimerobot" if uptime is not None else "unmeasured",
+            "incidents": _incidents_from_logs(monitor),
+        }
+        return _cache_and_return(snapshot, now)
+    except Exception as exc:
+        logger.warning("uptimerobot fetch failed: %s", exc)
+        return _cache_and_return(stale, failed_at)
+
+
+def _cache_and_return(snapshot: dict, now: float) -> dict:
+    """Store ``snapshot`` in the module cache stamped ``now``; return it."""
+    _cache.update(value=snapshot, at=now)
+    return snapshot
diff --git a/apps/api/tests/test_uptime_source.py b/apps/api/tests/test_uptime_source.py
@@ -0,0 +1,97 @@
+"""test_uptime_source.py — UptimeRobot uptime/incidents client (offline).
+
+Locks in the honesty contract: no key or any failure → unmeasured (None), NEVER a
+fabricated number; a real getMonitors payload → real ratio + real down-events.
+"""
+from __future__ import annotations
+
+import asyncio
+
+import pytest
+
+from services import uptime
+
+
+@pytest.fixture(autouse=True)
+def _clean(monkeypatch):  # isolate each test: no UptimeRobot env, empty cache
+    for name in ("UPTIMEROBOT_API_KEY", "UPTIMEROBOT_READ_ONLY_KEY", "UPTIMEROBOT_MONITOR_ID"):
+        monkeypatch.delenv(name, raising=False)
+    uptime._cache.clear()
+    yield  # the test body runs here
+    uptime._cache.clear()
+
+
+def test_no_key_is_unmeasured():
+    expected = {"uptime_30d": None, "uptime_source": "unmeasured", "incidents": None}
+    assert asyncio.run(uptime.get_uptime_snapshot()) == expected
+
+
+def test_pick_monitor_prefers_gateway():
+    docs = {"id": 1, "friendly_name": "Docs site"}
+    gateway = {"id": 2, "friendly_name": "Wayforth Gateway"}
+    assert uptime._pick_monitor([docs, gateway])["id"] == 2
+
+
+def test_pick_monitor_honors_pinned_id(monkeypatch):
+    docs = {"id": 1, "friendly_name": "Docs site"}
+    gateway = {"id": 2, "friendly_name": "Wayforth Gateway"}
+    monkeypatch.setenv("UPTIMEROBOT_MONITOR_ID", "1")  # pin beats the name match
+    assert uptime._pick_monitor([docs, gateway])["id"] == 1
+
+
+def test_incidents_only_down_events():
+    """Up (2) and paused (99) log entries must not become incidents."""
+    down = {"type": 1, "datetime": 1718000000, "duration": 540,
+            "reason": {"code": "503", "detail": "rank-service down"}}
+    up = {"type": 2, "datetime": 1718000540, "duration": 0, "reason": {"code": "200"}}
+    paused = {"type": 99, "datetime": 1717000000, "duration": 0, "reason": {}}
+    incidents = uptime._incidents_from_logs({"logs": [down, up, paused]})
+    assert len(incidents) == 1
+    (only,) = incidents
+    assert only["reason"] == "rank-service down"
+    assert only["duration_seconds"] == 540
+    assert only["started_at"].endswith("+00:00")
+
+
+def test_live_payload_parses(monkeypatch):
+    """A mocked getMonitors response yields a real ratio + incidents (no network)."""
+    payload = {"stat": "ok", "monitors": [{
+        "id": 777, "friendly_name": "Wayforth Gateway", "status": 2,
+        "custom_uptime_ratio": "99.231",
+        "logs": [{"type": 1, "datetime": 1718000000, "duration": 60,
+                  "reason": {"detail": "timeout"}}],
+    }]}
+
+    class _FakeResponse:
+        def json(self): return payload
+
+    class _FakeClient:  # stands in for httpx.AsyncClient; never opens a socket
+        def __init__(self, *args, **kwargs): pass
+        async def __aenter__(self): return self
+        async def __aexit__(self, *exc_info): return False
+        async def post(self, *args, **kwargs): return _FakeResponse()
+
+    monkeypatch.setenv("UPTIMEROBOT_API_KEY", "ro-test-key")
+    monkeypatch.setattr(uptime.httpx, "AsyncClient", _FakeClient)
+    snap = asyncio.run(uptime.get_uptime_snapshot())
+    assert snap["uptime_30d"] == 99.231
+    assert snap["uptime_source"] == "uptimerobot"
+    assert [i["reason"] for i in snap["incidents"]] == ["timeout"]
+
+
+def test_api_error_falls_back_unmeasured(monkeypatch):
+    monkeypatch.setenv("UPTIMEROBOT_API_KEY", "ro-test-key")
+
+    class _Resp:
+        def json(self): return {"stat": "fail", "error": {"message": "bad key"}}
+
+    class _Client:
+        def __init__(self, *a, **k): pass
+        async def __aenter__(self): return self
+        async def __aexit__(self, *a): return False
+        async def post(self, *a, **k): return _Resp()
+
+    monkeypatch.setattr(uptime.httpx, "AsyncClient", _Client)
+    snap = asyncio.run(uptime.get_uptime_snapshot())
+    # incidents must be None too: [] would read as a fabricated "zero incidents".
+    assert snap == {"uptime_30d": None, "uptime_source": "unmeasured", "incidents": None}