Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions apps/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1927,15 +1927,18 @@ async def system_status_v075(db=Depends(get_db)):
except Exception:
components["payments"] = "outage"

# uptime_30d / incidents: there is NO platform-uptime measurement or incident
# history behind these yet. The previous code claimed a hardcoded 99.97 and an
# empty incidents list — a fabricated "99.97% uptime, zero incidents" that was
# false (the prior query referenced service_probes.outcome/created_at columns
# that do not exist, so it errored every call and always returned the literal).
# Report null/unmeasured until a real source exists (see status_checks recorder
# follow-up). `status` + `components` above ARE real, live-measured signals.
uptime_30d = None
incidents = None
# uptime_30d / incidents: real values from UptimeRobot (the chosen source),
# cached ~5 min. NO hardcoded number — the previous code returned a fabricated
# 99.97 + empty incidents (its query hit service_probes.outcome/created_at,
# columns that don't exist, so it errored every call and fell back to the
# literal). When UptimeRobot is unconfigured/unreachable this returns
# uptime_30d=None / incidents=None ("unmeasured") — never a fabricated value.
# `status` + `components` above are independent, live-measured signals.
from services.uptime import get_uptime_snapshot
_uptime = await get_uptime_snapshot()
uptime_30d = _uptime["uptime_30d"]
incidents = _uptime["incidents"]
uptime_source = _uptime["uptime_source"]

# Overall rollup:
# "outage" = only when the api component itself is unreachable (gateway down).
Expand All @@ -1953,9 +1956,9 @@ async def system_status_v075(db=Depends(get_db)):
return {
"status": overall,
"components": components,
"uptime_30d": uptime_30d, # null until real uptime is instrumented
"uptime_source": "unmeasured", # no platform-uptime history source yet
"incidents": incidents, # null = unmeasured (NOT an empty list / "zero incidents")
"uptime_30d": uptime_30d, # real (UptimeRobot) or null when unmeasured
"uptime_source": uptime_source, # "uptimerobot" | "unmeasured"
"incidents": incidents, # real down-events, or null when unmeasured (never a fake [])
}


Expand Down
1 change: 1 addition & 0 deletions apps/api/pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ python_files = test_suite_v060.py test_suite_v062.py test_suite_v0610.py test_se
test_tier3_provider_gating.py
test_x402_client.py
test_rails_flag.py
test_uptime_source.py
markers =
no_api_key: test does not require WAYFORTH_TEST_API_KEY (e.g. probes unauthenticated paths)
147 changes: 147 additions & 0 deletions apps/api/services/uptime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""services/uptime.py — real uptime + incidents from UptimeRobot.

The status page's uptime number used to be a hardcoded 99.97 with an empty
incidents list (fabricated). The real source is UptimeRobot: this module calls
its getMonitors API with a READ-ONLY key, reads the Wayforth Gateway monitor's
30-day `custom_uptime_ratio`, and derives incidents from the monitor's event log
(real down events — e.g. a rank-service outage shows up, instead of a false
"none in 90 days").

Result is cached in-process for ~5 minutes so the public /system/status endpoint
does not hit UptimeRobot on every request. If a fetch raises, the last cached
snapshot is served when one exists; in every other failure mode the result is
"unmeasured" (uptime=None, incidents=None) — we NEVER fabricate a number.

Config (backend env):
UPTIMEROBOT_API_KEY — read-only API key (also accepts UPTIMEROBOT_READ_ONLY_KEY)
UPTIMEROBOT_MONITOR_ID — optional: pin the exact monitor; otherwise the
monitor whose name contains "gateway"/"wayforth" is
used, else the first monitor returned.
"""
from __future__ import annotations

import logging
import os
import time
from datetime import datetime, timezone

import httpx

# Logger registered under the name "wayforth"; used for fetch warnings below.
logger = logging.getLogger("wayforth")

# UptimeRobot v2 getMonitors endpoint that get_uptime_snapshot() POSTs to.
_API_URL = "https://api.uptimerobot.com/v2/getMonitors"
# Maximum age of a cached snapshot before UptimeRobot is queried again.
_CACHE_TTL = 300 # seconds (~5 min)
# Value passed as `timeout=` to httpx.AsyncClient for the getMonitors request.
_HTTP_TIMEOUT = 6.0

# In-process cache: {"at": epoch, "value": snapshot}
# Empty until the first snapshot is stored by _cache_and_return().
_cache: dict = {}


def _api_key() -> str:
    """Return the configured UptimeRobot API key, or "" when none is set.

    ``UPTIMEROBOT_API_KEY`` takes precedence; ``UPTIMEROBOT_READ_ONLY_KEY`` is
    consulted only when the former is unset or empty.
    """
    for env_name in ("UPTIMEROBOT_API_KEY", "UPTIMEROBOT_READ_ONLY_KEY"):
        candidate = os.environ.get(env_name)
        if candidate:
            return candidate
    return ""


def _unmeasured() -> dict:
    """Build a fresh "no data" snapshot: null uptime and null incidents."""
    return dict(
        uptime_30d=None,
        uptime_source="unmeasured",
        incidents=None,
    )


def _pick_monitor(monitors: list[dict]) -> dict | None:
    """Choose which monitor in a getMonitors result to report on.

    Selection order:
      1. the monitor whose ``id`` equals ``UPTIMEROBOT_MONITOR_ID`` (if set);
      2. the first monitor whose friendly name contains "gateway" or
         "wayforth" (case-insensitive);
      3. the first monitor in the list.

    Returns ``None`` only when ``monitors`` is empty. A pinned id that matches
    nothing falls through to steps 2 and 3.
    """
    if not monitors:
        return None

    pinned_id = os.environ.get("UPTIMEROBOT_MONITOR_ID", "").strip()
    if pinned_id:
        exact = next(
            (mon for mon in monitors if str(mon.get("id")) == pinned_id),
            None,
        )
        if exact is not None:
            return exact

    # No usable pin: prefer the gateway/wayforth monitor by name.
    keywords = ("gateway", "wayforth")
    for mon in monitors:
        label = (mon.get("friendly_name") or "").lower()
        if any(word in label for word in keywords):
            return mon

    return monitors[0]


def _incidents_from_logs(monitor: dict, *, limit: int = 10) -> list[dict]:
    """Convert a monitor's down-events (log type 1) into incident records.

    Args:
        monitor: One monitor dict from getMonitors; its ``logs`` entry may be
            missing or ``None``.
        limit: Maximum number of incidents to return.

    Returns:
        Up to ``limit`` dicts with ``started_at`` (UTC ISO-8601 string, or
        ``None`` when the log has no timestamp), ``duration_seconds`` and
        ``reason``, ordered newest first.
    """

    def _to_incident(entry: dict) -> dict:
        # One down-event log entry -> one incident record.
        epoch = entry.get("datetime")
        if epoch:
            began = datetime.fromtimestamp(int(epoch), tz=timezone.utc).isoformat()
        else:
            began = None
        why = entry.get("reason") or {}
        return {
            "started_at": began,
            "duration_seconds": int(entry.get("duration") or 0),
            "reason": why.get("detail") or why.get("code") or "down",
        }

    down_events = [
        _to_incident(entry)
        for entry in (monitor.get("logs", []) or [])
        if entry.get("type") == 1  # 1 = down (2 = up, 99 = paused)
    ]
    # Entries without a timestamp sort as "" and therefore land last.
    newest_first = sorted(
        down_events,
        key=lambda item: item.get("started_at") or "",
        reverse=True,
    )
    return newest_first[:limit]


async def get_uptime_snapshot() -> dict:
    """Return ``{uptime_30d, uptime_source, incidents}`` from UptimeRobot.

    A snapshot younger than ``_CACHE_TTL`` is served straight from the
    in-process cache; otherwise getMonitors is queried and the result cached.
    The call is fail-safe: it never raises and never fabricates a value.

    Returns:
        A dict with keys ``uptime_30d`` (float percentage or ``None``),
        ``uptime_source`` (``"uptimerobot"`` or ``"unmeasured"``) and
        ``incidents`` (list of down-event dicts or ``None``).

        * No API key configured -> unmeasured, nothing cached.
        * UptimeRobot answers with ``stat != "ok"`` or no monitors ->
          unmeasured, cached for the full TTL.
        * The request or parsing raises -> the last cached snapshot if one
          exists, else unmeasured; retried after a short back-off.
    """
    key = _api_key()
    if not key:
        return _unmeasured()

    now = time.time()
    cached = _cache.get("value")
    if cached is not None and (now - _cache.get("at", 0)) < _CACHE_TTL:
        return cached

    try:
        async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT) as client:
            resp = await client.post(
                _API_URL,
                data={
                    "api_key": key,
                    "format": "json",
                    "custom_uptime_ratios": "30",  # 30-day ratio
                    "logs": "1",
                },
                headers={"Cache-Control": "no-cache"},
            )
        data = resp.json()
        if data.get("stat") != "ok":
            logger.warning("uptimerobot getMonitors not ok: %s", data.get("error"))
            return _cache_and_return(_unmeasured(), now)

        monitor = _pick_monitor(data.get("monitors", []))
        if not monitor:
            return _cache_and_return(_unmeasured(), now)

        ratio_raw = monitor.get("custom_uptime_ratio")
        # custom_uptime_ratio is a string like "99.950" for the single window.
        # Several windows would be "-"-joined; only the first one is read.
        uptime = None
        if ratio_raw not in (None, ""):
            try:
                uptime = round(float(str(ratio_raw).split("-")[0]), 3)
            except (ValueError, TypeError):
                uptime = None

        snapshot = {
            "uptime_30d": uptime,
            "uptime_source": "uptimerobot" if uptime is not None else "unmeasured",
            "incidents": _incidents_from_logs(monitor),
        }
        return _cache_and_return(snapshot, now)
    except Exception as exc:
        logger.warning("uptimerobot fetch failed: %s", exc)
        # Serve a stale cached value if we have one; else unmeasured.
        fallback = cached if cached is not None else _unmeasured()
        # Remember the failure too. Without this, once the TTL has lapsed every
        # request during an UptimeRobot outage would issue a new call and could
        # block for the full HTTP timeout. The entry is backdated so it expires
        # after a short back-off instead of the whole TTL.
        failure_retry_after = min(30.0, _CACHE_TTL)
        return _cache_and_return(fallback, now - _CACHE_TTL + failure_retry_after)


def _cache_and_return(snapshot: dict, now: float) -> dict:
    """Store *snapshot* in the module cache stamped at *now*, then return it."""
    _cache.update(value=snapshot, at=now)
    return snapshot
97 changes: 97 additions & 0 deletions apps/api/tests/test_uptime_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""test_uptime_source.py — UptimeRobot uptime/incidents client (offline).

Locks in the honesty contract: no key or any failure → unmeasured (None), NEVER a
fabricated number; a real getMonitors payload → real ratio + real down-events.
"""
from __future__ import annotations

import asyncio

import pytest

from services import uptime


@pytest.fixture(autouse=True)
def _clean(monkeypatch):
    """Run every test with no UptimeRobot env config and an empty snapshot cache."""
    env_names = (
        "UPTIMEROBOT_API_KEY",
        "UPTIMEROBOT_READ_ONLY_KEY",
        "UPTIMEROBOT_MONITOR_ID",
    )
    for env_name in env_names:
        monkeypatch.delenv(env_name, raising=False)
    uptime._cache.clear()  # nothing carried over from a previous test
    yield
    uptime._cache.clear()  # nothing leaked to the next test


def test_no_key_is_unmeasured():
    """With no API key in the environment the snapshot is entirely unmeasured."""
    expected = {"uptime_30d": None, "uptime_source": "unmeasured", "incidents": None}
    result = asyncio.run(uptime.get_uptime_snapshot())
    assert result == expected


def test_pick_monitor_prefers_gateway():
    """Without a pinned id, the gateway-named monitor wins over list order."""
    docs_site = {"id": 1, "friendly_name": "Docs site"}
    gateway = {"id": 2, "friendly_name": "Wayforth Gateway"}
    chosen = uptime._pick_monitor([docs_site, gateway])
    assert chosen["id"] == 2


def test_pick_monitor_honors_pinned_id(monkeypatch):
    """UPTIMEROBOT_MONITOR_ID overrides the name-based preference."""
    monkeypatch.setenv("UPTIMEROBOT_MONITOR_ID", "1")
    docs_site = {"id": 1, "friendly_name": "Docs site"}
    gateway = {"id": 2, "friendly_name": "Wayforth Gateway"}
    chosen = uptime._pick_monitor([docs_site, gateway])
    assert chosen["id"] == 1


def test_incidents_only_down_events():
    """Only type-1 (down) log entries become incidents; up and paused are dropped."""
    down = {
        "type": 1,
        "datetime": 1718000000,
        "duration": 540,
        "reason": {"code": "503", "detail": "rank-service down"},
    }
    back_up = {"type": 2, "datetime": 1718000540, "duration": 0, "reason": {"code": "200"}}
    paused = {"type": 99, "datetime": 1717000000, "duration": 0, "reason": {}}

    found = uptime._incidents_from_logs({"logs": [down, back_up, paused]})

    assert len(found) == 1
    only = found[0]
    assert only["reason"] == "rank-service down"
    assert only["duration_seconds"] == 540
    assert only["started_at"].endswith("+00:00")


def test_live_payload_parses(monkeypatch):
    """A mocked getMonitors response yields a real ratio + incidents (no network)."""
    monkeypatch.setenv("UPTIMEROBOT_API_KEY", "ro-test-key")

    gateway_monitor = {
        "id": 777,
        "friendly_name": "Wayforth Gateway",
        "status": 2,
        "custom_uptime_ratio": "99.231",
        "logs": [
            {"type": 1, "datetime": 1718000000, "duration": 60,
             "reason": {"detail": "timeout"}},
        ],
    }

    class _Resp:
        # Stands in for the httpx response; only .json() is consulted.
        def json(self):
            return {"stat": "ok", "monitors": [gateway_monitor]}

    class _Client:
        # Stands in for httpx.AsyncClient: async context manager with .post().
        def __init__(self, *a, **k):
            pass

        async def __aenter__(self):
            return self

        async def __aexit__(self, *a):
            return False

        async def post(self, *a, **k):
            return _Resp()

    monkeypatch.setattr(uptime.httpx, "AsyncClient", _Client)
    result = asyncio.run(uptime.get_uptime_snapshot())

    assert result["uptime_30d"] == 99.231
    assert result["uptime_source"] == "uptimerobot"
    assert len(result["incidents"]) == 1
    assert result["incidents"][0]["reason"] == "timeout"


def test_api_error_falls_back_unmeasured(monkeypatch):
    """A ``stat: fail`` answer from UptimeRobot yields a fully unmeasured snapshot.

    Covers all three fields of the contract, including ``incidents``: it must
    be ``None`` (unmeasured), not an empty list that reads as "zero incidents".
    """
    monkeypatch.setenv("UPTIMEROBOT_API_KEY", "ro-test-key")

    class _Resp:
        # Stands in for the httpx response; only .json() is consulted.
        def json(self):
            return {"stat": "fail", "error": {"message": "bad key"}}

    class _Client:
        # Stands in for httpx.AsyncClient: async context manager with .post().
        def __init__(self, *a, **k):
            pass

        async def __aenter__(self):
            return self

        async def __aexit__(self, *a):
            return False

        async def post(self, *a, **k):
            return _Resp()

    monkeypatch.setattr(uptime.httpx, "AsyncClient", _Client)
    snap = asyncio.run(uptime.get_uptime_snapshot())
    assert snap["uptime_30d"] is None
    assert snap["uptime_source"] == "unmeasured"
    assert snap["incidents"] is None
Loading